Author: Anthony Strittmatter
We analyse the browsing and online purchasing behaviour of households using Comscore’s web browser data. The data file browser_2006.csv contains 1,500 households that spent at least 1 US-dollar online in 2006. The variable spend is the online spending (in US-dollars) of a household. Furthermore, the data contains the browser history of households for the 1,000 most heavily trafficked websites (see the list of websites in browser-sites.txt). In particular, the data contains the percentage of time spent on specific websites out of the total time spent online. Additionally, we have access to the file browser_new.csv, which contains the browser history of 500 new households, but not their online spending.
Load the packages rpart, rpart.plot, grf, and DiagrammeR. Read the data sets browser_2006.csv and browser_new.csv. Generate matrices for the outcome, control, as well as id variables for both data sets.
######################## Load Packages and Data ########################
# Load packages
library(rpart)
library(rpart.plot)
library(grf)
#library(DiagrammeR)

# Read both data sets (comma-separated)
data_2006 <- read.csv("browser_2006.csv", sep = ",")
data_new <- read.csv("browser_new.csv", sep = ",")

# Split each data set into id (column 1), outcome (column 2, 2006 data only),
# and covariate matrices (remaining columns = website browsing-time shares)
id_2006 <- as.matrix(data_2006[, 1])
y_2006 <- as.matrix(data_2006[, 2])
x_2006 <- as.matrix(data_2006[, 3:ncol(data_2006)])

id_new <- as.matrix(data_new[, 1])
x_new <- as.matrix(data_new[, 2:ncol(data_new)])

print('Packages and data successfully loaded.')
#############################################################################
[1] "Packages and data successfully loaded."
a) How much is the average online spending in 2006?
######################## Average Spending ########################
# Mean online spending in 2006, rounded to cents
avg_spend <- round(mean(y_2006), digits = 2)
print(paste0("In 2006, the average spending is ", avg_spend, " US-dollars"))
####################################################################
[1] "In 2006, the average spending is 2064.6 US-dollars"
b) On which webpage is the household with id = 921 most of the time?
######################## Online Time ########################
# Browsing-time shares (in % of total online time) of household 921.
# Extract the row once instead of recomputing the subset four times.
shares_921 <- x_2006[id_2006 == 921, ]
is_top <- shares_921 == max(shares_921)
# 'digits' spelled out in full (the original relied on partial matching of 'digit')
freq <- round(shares_921[is_top], digits = 0)
page <- names(shares_921[is_top])
print(paste0("Household 921 is most of the time on the webpage ", page))
print(paste0(freq, "% of the online time is the household on this webpage"))
################################################################
[1] "Household 921 is most of the time on the webpage yahoo.com" [1] "22% of the online time is the household on this webpage"
c) Generate a variable for log online spendings. Plot the cumulative distribution of online spendings and log online spendings.
######################## Log Transformation ########################
# Log online spending (every household spent at least 1 USD, so log >= 0)
log_y_2006 <- as.matrix(log(y_2006))

# Empirical CDF of spending in levels (x-axis truncated for readability)
plot(ecdf(y_2006), xlab = "Spending in US-Dollars",
     sub = "(Truncated at 20,000 US-Dollars)",
     ylab = "cdf", main = "Distribution of Spending", xlim = c(0, 20000))

# Empirical CDF of log spending
plot(ecdf(log_y_2006), xlab = "log Spending", ylab = "cdf",
     main = "Distribution of Log Spending")
#######################################################################
e) Randomly partition the 2006 data into a training and estimation sample of equal size. For this purpose, generate a variable that indicates the rows that are included in the training sample (using the sample command).
######################## Training and Test Samples ########################
set.seed(1001)  # reproducible split
# Draw half of the 2006 rows (without replacement) as the training sample;
# the rows not drawn form the test sample.
n_train <- floor(0.5 * nrow(data_2006))
training_set <- sample(seq_len(nrow(data_2006)), size = n_train)
print('Training and test samples created.')
#############################################################################
[1] "Training and test samples created."
a) Build in the training sample a shallow tree (terminal leaves should contain at least 150 observations) with the outcome log online spendings. Plot the structure of the shallow tree.
######################## Shallow Tree ########################
# Training-sample outcome and covariates combined into a data.frame,
# as required by rpart's formula interface (first column = outcome).
outcome <- log_y_2006[training_set]
tree_data_2006 <- data.frame(outcome, x_2006[training_set,])

# Grow a shallow regression tree with 10-fold cross-validation.
# 'minbucket = 150' restricts every terminal leaf to at least 150 training
# observations; growing also stops when the relative MSE gain of one
# additional split falls below cp = 0.00002.
set.seed(1001)
shallow_tree <- rpart(formula = outcome ~ ., data = tree_data_2006,
                      method = "anova", xval = 10, y = TRUE,
                      control = rpart.control(cp = 0.00002, minbucket = 150))

# Plot the tree structure (it splits on bizrate.com and fedex.com)
rpart.plot(shallow_tree, digits = 3)
################################################################
b) Build in the training sample a deep tree (terminal leaves should contain at least 10 observations) with the outcome log online spendings. Plot the cross-validated MSE.
######################## Deep Tree ########################
# Same specification as the shallow tree, but terminal leaves may contain
# as few as 10 observations, so the tree grows much deeper.
set.seed(1001)
deep_tree <- rpart(formula = outcome ~ ., data = tree_data_2006,
                   method = "anova", xval = 10, y = TRUE,
                   control = rpart.control(cp = 0.00002, minbucket = 10))

# Cross-validated MSE (relative to the root) for every candidate tree size
print('Relative CV-MSE for different tree sizes')
print(deep_tree$cptable)

# Plot CV-MSE against tree size
plotcp(deep_tree)
#############################################################
[1] "Relative CV-MSE for different tree sizes"
CP nsplit rel error xerror xstd
1 0.103556002 0 1.0000000 1.0027902 0.05336615
2 0.040244878 1 0.8964440 0.9044308 0.04818024
3 0.039183891 2 0.8561991 0.8978877 0.04719138
4 0.025774260 3 0.8170152 0.8497505 0.04523399
5 0.019723677 4 0.7912410 0.8714202 0.04640616
6 0.017016657 7 0.7320699 0.8970222 0.04863031
7 0.016303949 8 0.7150533 0.9229249 0.05160440
8 0.015339271 9 0.6987493 0.9531216 0.05417941
9 0.014704083 10 0.6834101 0.9475180 0.05288525
10 0.014403657 11 0.6687060 0.9619296 0.05386769
11 0.013108272 12 0.6543023 0.9759346 0.05377475
12 0.012988528 13 0.6411941 0.9775607 0.05483416
13 0.012701805 14 0.6282055 0.9805804 0.05469393
14 0.012643794 15 0.6155037 0.9796215 0.05462458
15 0.012431081 16 0.6028599 0.9857379 0.05468154
16 0.011998919 17 0.5904288 1.0163353 0.05546118
17 0.011303778 18 0.5784299 1.0301449 0.05628081
18 0.010548451 19 0.5671261 1.0458060 0.05758861
19 0.010380515 20 0.5565777 1.0521179 0.05769230
20 0.010354163 21 0.5461972 1.0520059 0.05755906
21 0.009387774 22 0.5358430 1.0565251 0.05778878
22 0.009374683 23 0.5264552 1.0692086 0.05851306
23 0.008193833 24 0.5170806 1.0768157 0.05908235
24 0.007761662 25 0.5088867 1.0806637 0.05946672
25 0.007360830 26 0.5011251 1.0861163 0.05966284
26 0.006569858 27 0.4937642 1.1175651 0.06104068
27 0.006523933 28 0.4871944 1.1264676 0.06111235
28 0.006262560 30 0.4741465 1.1264676 0.06111235
29 0.006239112 31 0.4678840 1.1327414 0.06128695
30 0.005892330 32 0.4616448 1.1305609 0.06098554
31 0.005871218 33 0.4557525 1.1322393 0.06105560
32 0.004953488 35 0.4440101 1.1383994 0.06109590
33 0.004368496 36 0.4390566 1.1378106 0.06240649
34 0.003885572 37 0.4346881 1.1442797 0.06291137
35 0.003595647 38 0.4308025 1.1485991 0.06292907
36 0.002983498 40 0.4236112 1.1485806 0.06284348
37 0.001851946 41 0.4206277 1.1466157 0.06282476
38 0.001481869 42 0.4187758 1.1484948 0.06281477
39 0.000020000 43 0.4172939 1.1484948 0.06281477
c) Determine the optimal number of terminal leaves.
######################## Optimal Tree Size ########################
# Row of the cp-table with the lowest cross-validated MSE;
# number of terminal leaves = number of splits + 1
op.index <- which.min(deep_tree$cptable[, "xerror"])
op.size <- deep_tree$cptable[op.index, "nsplit"] + 1
print(paste0("Optimal number final leaves: ", op.size))
#####################################################################
[1] "Optimal number final leaves: 4"
d) Prune the deep tree and plot the structure of the pruned tree.
######################## Pruned Tree ########################
# Prune the deep tree back to the CV-MSE-minimising size by re-using
# the cp value from the optimal row of the cp-table
cp.vals <- deep_tree$cptable[op.index, "CP"]
pruned_tree <- prune(deep_tree, cp = cp.vals)

# Plot the pruned tree structure (splits include aggregateknowledge.com)
rpart.plot(pruned_tree, digits = 3)
################################################################
e) Calculate the $\mathbf{R^2}$ in the test sample.
######################## Out-of-Sample Performance ########################
# Predict log spending for all households, then keep the test rows only
pred_tree <- predict(pruned_tree, newdata = as.data.frame(x_2006))
outcome_test <- log_y_2006[-training_set]
pred_tree_test <- pred_tree[-training_set]

# R-squared = 1 - MSE/SST. SST uses the n-denominator mean of squared
# deviations so it is consistent with the forest evaluations in this file
# (the original divided an n-denominator MSE by the (n-1)-denominator var()).
MSE_tree <- mean((outcome_test - pred_tree_test)^2)
SST_tree <- mean((outcome_test - mean(outcome_test))^2)
r2_tree <- round(1 - MSE_tree / SST_tree, digits = 3)
print(paste0("Test sample R-squared: ", r2_tree))
##############################################################################
[1] "Test sample R-squared: 0.107"
a) Build in the training sample a random forest to predict log online spending. The forest should contain 1000 trees. Each tree should use a 50% subsample of the training data, 1/3 of the covariates, and restrict the min.node.size to 100.
######################## Random Forest ########################
# Tuning parameters (re-used by the later forest chunks)
rep <- 1000    # number of trees in the forest
cov <- 1/3     # share of covariates drawn for each split
frac <- 1/2    # subsample fraction per tree
min_obs <- 100 # minimum number of observations per terminal leaf

# Fit a regression forest on the training sample
set.seed(10001)
forest1 <- regression_forest(x_2006[training_set,], log_y_2006[training_set,],
                             mtry = floor(cov * ncol(x_2006)),
                             sample.fraction = frac, num.trees = rep,
                             min.node.size = min_obs, honesty = FALSE)
print('Forest is built.')
##################################################################
[1] "Forest is built."
b) Plot a tree of the forest.
######################## Plot Example Tree ########################
# Illustration only: plot a single tree of the forest (here tree number 4
# out of the 1000 trees the forest contains)
tree <- get_tree(forest1, 4)
plot(tree)
#####################################################################
c) Plot the variable importance. Why do we have to be cautious when interpreting the variable importance?
######################## Variable Importance ########################
# Importance based on the first split only; report variables above 0.02
imp1 <- variable_importance(forest1, max.depth = 1)
print(cbind(colnames(x_2006[, imp1 > 0.02]), imp1[imp1 > 0.02]))

# Importance based on the first four split levels, with decaying weights
# for deeper splits
imp2 <- round(variable_importance(forest1, decay.exponent = 2, max.depth = 4),
              digits = 3)
print(cbind(colnames(x_2006[, imp2 > 0.02]), imp2[imp2 > 0.02]))
########################################################################
[,1] [,2]
[1,] "liveperson.net" "0.039"
[2,] "ups.com" "0.087"
[3,] "fedex.com" "0.142"
[4,] "jcpenney.com" "0.025"
[5,] "searchmarketing.com" "0.023"
[6,] "marriott.com" "0.033"
[7,] "bizrate.com.o01" "0.264"
[8,] "aggregateknowledge.com" "0.18"
[,1] [,2]
[1,] "liveperson.net" "0.036"
[2,] "ups.com" "0.069"
[3,] "fedex.com" "0.113"
[4,] "jcpenney.com" "0.022"
[5,] "marriott.com" "0.029"
[6,] "bizrate.com.o01" "0.22"
[7,] "aggregateknowledge.com" "0.14"
d) Use the forest to predict the online spendings in the test sample. Evaluate the performance of the random forest using the $\mathbf{R^2}$.
######################## Out-of-Sample Performance ########################
# Test-sample predictions of log spending
fit <- predict(forest1, newdata = x_2006[-training_set,])$predictions

# R-squared in the test sample (n-denominator SST)
SST <- mean((log_y_2006[-training_set,] - mean(log_y_2006[-training_set,]))^2)
MSE1 <- mean((log_y_2006[-training_set,] - fit)^2)
r2_1 <- round(1 - MSE1 / SST, digits = 3)
print(paste0("Test sample R-squared: ", r2_1))
#############################################################################
[1] "Test sample R-squared: 0.192"
e) Draw an area under the curve (AUC) graph with regard to the number of trees in the forest.
######################## Area Under the Curve (AUC) ########################
# Grid of forest sizes (number of trees) to evaluate
sizes <- c(1000, 500, 400, 300, 200, 100, 50, 40, 30, 20, 10, 5, 4, 3, 2, 1)

# Result matrix: one row per forest size
auc <- matrix(NA, nrow = length(sizes), ncol = 3)
colnames(auc) <- c("Trees", "AUC", "Marginal AUC")
auc[, 1] <- sizes

# Total sum of squares in the test sample
SST <- mean((log_y_2006[-training_set,] - mean(log_y_2006[-training_set,]))^2)

set.seed(10001) # set starting value
for (n_trees in sizes) {
  # Refit the forest with the current number of trees ...
  forest <- regression_forest(x_2006[training_set,], log_y_2006[training_set,],
                              mtry = floor(cov * ncol(x_2006)),
                              sample.fraction = frac, num.trees = n_trees,
                              min.node.size = min_obs, honesty = FALSE)
  # ... and store its test-sample R-squared
  fit <- predict(forest, newdata = x_2006[-training_set,])$predictions
  auc[auc[, 1] == n_trees, 2] <- 1 - mean((log_y_2006[-training_set,] - fit)^2) / SST
}

# Marginal R-squared gain relative to the next smaller forest size
auc[, 3] <- auc[, 2] - rbind(as.matrix(auc[-1, 2]), auc[nrow(auc), 2])

# Plot R-squared against the number of trees
plot(auc[, 1], auc[, 2], type = "o", xlab = "Trees", ylab = "R-squared", main = "AUC")
abline(a = 0, b = 0, col = "red")
################################################################################
f) Build a forest with smaller min.node.size (= 5) and test if this improves the $\mathbf{R^2}$ in the test sample.
######################## Deep Forest ########################
# Re-fit the forest with much smaller terminal leaves (minimum 5 observations)
# NOTE(review): no set.seed() before this fit — results are only reproducible
# if the preceding chunks were run in order; consider seeding here.
min_obs <- 5
forest2 <- regression_forest(x_2006[training_set,], log_y_2006[training_set,],
                             mtry = floor(cov * ncol(x_2006)),
                             sample.fraction = frac, num.trees = rep,
                             min.node.size = min_obs, honesty = FALSE)

# Test-sample predictions and R-squared, printed next to the previous
# forest's R-squared (r2_1) for comparison
fit <- predict(forest2, newdata = x_2006[-training_set,])$predictions
SST <- mean((log_y_2006[-training_set,] - mean(log_y_2006[-training_set,]))^2)
MSE2 <- mean((log_y_2006[-training_set,] - fit)^2)
r2_2 <- round(1 - MSE2 / SST, digits = 3)
print(cbind(r2_1, r2_2))

# Plot one example tree of the deep forest
tree <- get_tree(forest2, 34)
plot(tree)
###############################################################
r2_1 r2_2 [1,] 0.192 0.224
g) Use the data browser_new.csv, which contains the browsing behaviour of new potential customers. Predict the online spending in the new data using the prediction model that performs best in the test sample. Export the IDs and the predicted spendings of the new customers to a csv-file. These predictions might help you to target marketing campaigns at the new potential customers with the highest (or lowest) expected online spending.
######################## Store Prediction for Hold-out-Sample ########################
# Predict (log) online spending for the 500 new households using the
# best-performing model in the test sample (the deep forest, forest2)
fit_new <- predict(forest2, newdata = x_new)$predictions

# Store IDs and predictions in one matrix and label the columns
results <- as.matrix(cbind(id_new, fit_new))
colnames(results) <- c("id", "predictions")

# Write to disk; row.names = FALSE avoids an extra unnamed row-number column
write.csv(results, "predictions.csv", row.names = FALSE)
print('Results for the hold-out-sample stored.')
#########################################################################################
[1] "Results for the hold-out-sample stored."
Useful links: